Aluno: Felipe Martim Fernandes Vieira
O objetivo desse trabalho é fazer a análise exploratória de uma base de dados de vinhos de uma região de Portugal, e construir modelos utilizando as técnicas aprendidas em sala de aula. As diferentes etapas do trabalho seguem abaixo:
library(dplyr)
library(corrgram)
library('GGally')
library(plotly)
library(caret)
library(e1071)
library(lmtest)
library(DAAG)
library(rpart)
library(rpart.plot)
library(rattle)
library(tclust)
library(cluster)
library(fpc)
library(ROCR)
Loading required package: gplots
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
setwd('~/workspace/fiap/wine')
wines <- read.csv2(file="BaseWine_Red_e_White.csv"
, header=TRUE
, sep=";")
str(wines)
'data.frame': 6497 obs. of 14 variables:
$ id_vinho : int 1 2 3 4 5 6 7 8 9 10 ...
$ fixedacidity : num 6.6 6.7 10.6 5.4 6.7 6.8 6.6 7.2 5.1 6.2 ...
$ volatileacidity : num 0.24 0.34 0.31 0.18 0.3 0.5 0.61 0.66 0.26 0.22 ...
$ citricacid : num 0.35 0.43 0.49 0.24 0.44 0.11 0 0.33 0.33 0.2 ...
$ residualsugar : num 7.7 1.6 2.2 4.8 18.8 ...
$ chlorides : num 0.031 0.041 0.063 0.041 0.057 0.075 0.069 0.068 0.027 0.035 ...
$ freesulfurdioxide : num 36 29 18 30 65 16 4 34 46 58 ...
$ totalsulfurdioxide: num 135 114 40 113 224 49 8 102 113 184 ...
$ density : num 0.994 0.99 0.998 0.994 1 ...
$ pH : num 3.19 3.23 3.14 3.42 3.11 3.36 3.33 3.27 3.35 3.11 ...
$ sulphates : num 0.37 0.44 0.51 0.4 0.53 0.79 0.37 0.78 0.43 0.53 ...
$ alcohol : num 10.5 12.6 9.8 9.4 9.1 9.5 10.4 12.8 11.4 9 ...
$ quality : int 5 6 6 6 5 5 4 6 7 6 ...
$ Vinho : Factor w/ 2 levels "RED","WHITE": 2 2 1 2 2 1 1 1 2 2 ...
summary(wines)
id_vinho fixedacidity volatileacidity citricacid residualsugar chlorides freesulfurdioxide
Min. : 1 Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.60 Min. :0.00900 Min. : 1.00
1st Qu.:1625 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.80 1st Qu.:0.03800 1st Qu.: 17.00
Median :3249 Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.00 Median :0.04700 Median : 29.00
Mean :3249 Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.44 Mean :0.05603 Mean : 30.53
3rd Qu.:4873 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.10 3rd Qu.:0.06500 3rd Qu.: 41.00
Max. :6497 Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :45.80 Max. :0.61100 Max. :289.00
totalsulfurdioxide density pH sulphates alcohol quality Vinho
Min. : 6.0 Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 0.9567 Min. :3.000 RED :1599
1st Qu.: 77.0 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5000 1st Qu.:5.000 WHITE:4898
Median :118.0 Median :0.9949 Median :3.210 Median :0.5100 Median :10.3000 Median :6.000
Mean :115.7 Mean :0.9947 Mean :3.219 Mean :0.5313 Mean :10.4862 Mean :5.818
3rd Qu.:156.0 3rd Qu.:0.9970 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.3000 3rd Qu.:6.000
Max. :440.0 Max. :1.0140 Max. :4.010 Max. :2.0000 Max. :14.9000 Max. :9.000
Verificando se há valores faltantes
sum(is.na(wines))
[1] 0
A variável id_vinho não é necessária para a nossa análise e será removida. Para manter a consistência na nomenclatura, renomeei Vinho para type.
wines_adjusted <- wines %>% select(-id_vinho) %>% rename(type = Vinho)
p1 <- wines_adjusted %>% ggplot(aes(x = chlorides)) +
geom_histogram(bins = 40, fill = 'lightblue')
p2 <- wines_adjusted %>% ggplot(aes(x = sulphates)) +
geom_histogram(bins = 40, fill = 'lightblue')
p3 <- wines_adjusted %>% ggplot(aes(x = fixedacidity)) +
geom_histogram(bins = 40, fill = 'lightblue')
p4 <- wines_adjusted %>% ggplot(aes(x = freesulfurdioxide)) +
geom_histogram(bins = 40, fill = 'lightblue')
p5 <- wines_adjusted %>% ggplot(aes(x = alcohol)) +
geom_histogram(bins = 40, fill = 'lightblue')
p6 <- wines_adjusted %>% ggplot(aes(x = volatileacidity)) +
geom_histogram(bins = 40, fill = 'lightblue')
p7 <- wines_adjusted %>% ggplot(aes(x = totalsulfurdioxide)) +
geom_histogram(bins = 40, fill = 'lightblue')
p8 <- wines_adjusted %>% ggplot(aes(x = citricacid)) +
geom_histogram(bins = 40, fill = 'lightblue')
p9 <- wines_adjusted %>% ggplot(aes(x = density)) +
geom_histogram(bins = 40, fill = 'lightblue')
p10 <- wines_adjusted %>% ggplot(aes(x = pH)) +
geom_histogram(bins = 40, fill = 'lightblue')
p11 <- wines_adjusted %>% ggplot(aes(x = residualsugar)) +
geom_histogram(bins = 40, fill = 'lightblue')
p12 <- wines_adjusted %>% ggplot(aes(x = quality)) +
geom_histogram(bins = 6, fill = 'lightblue')
multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, cols = 3 )
Ao visualizar os boxplots abaixo, é possível verificar que existem inúmeros outliers, que provavelmente serão removidos para a modelagem.
par (mfrow=c(2,2))
boxplot(wines_adjusted$chlorides, main='chlorides')
boxplot(wines_adjusted$sulphates, main='sulphates')
boxplot(wines_adjusted$fixedacidity, main='fixedacidity')
boxplot(wines_adjusted$residualsugar, main='residualsugar')
boxplot(wines_adjusted$freesulfurdioxide, main='freesulfurdioxide')
boxplot(wines_adjusted$alcohol, main='alcohol')
boxplot(wines_adjusted$volatileacidity, main='volatileacidity')
boxplot(wines_adjusted$totalsulfurdioxide, main='totalsulfurdioxide')
boxplot(wines_adjusted$quality, main='quality')
boxplot(wines_adjusted$citricacid, main='citricacid')
boxplot(wines_adjusted$density, main='density')
boxplot(wines_adjusted$pH, main='pH')
par (mfrow=c(1,1))
wines_padr <- preProcess(wines_adjusted[,1:11], c("center", "scale")) %>%
predict(., wines_adjusted) %>%
data.frame(trans = .)
colnames(wines_padr) <- colnames(wines_adjusted)
str(wines_padr)
'data.frame': 6497 obs. of 14 variables:
$ fixedacidity : num -0.475 -0.397 2.611 -1.4 -0.397 ...
$ volatileacidity : num -0.60537 0.00203 -0.18019 -0.96981 -0.24093 ...
$ citricacid : num 0.216 0.766 1.179 -0.541 0.835 ...
$ residualsugar : num 0.478 -0.813 -0.686 -0.135 2.817 ...
$ chlorides : num -0.7146 -0.4291 0.1988 -0.4291 0.0276 ...
$ freesulfurdioxide : num 0.3084 -0.0859 -0.7057 -0.0296 1.9423 ...
$ totalsulfurdioxide: num 0.3407 -0.0309 -1.3401 -0.0486 1.9153 ...
$ density : num -0.3019 -1.5394 0.983 -0.0821 1.6457 ...
$ pH : num -0.1773 0.0715 -0.4882 1.2532 -0.6748 ...
$ sulphates : num -1.08375 -0.61334 -0.14293 -0.88214 -0.00852 ...
$ alcohol : num 0.0114 1.739 -0.5645 -0.8936 -1.1404 ...
$ quality : int 5 6 6 6 5 5 4 6 7 6 ...
$ type : Factor w/ 2 levels "RED","WHITE": 2 2 1 2 2 1 1 1 2 2 ...
$ good : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
summary(wines_padr)
fixedacidity volatileacidity citricacid residualsugar chlorides freesulfurdioxide
Min. :-2.6344 Min. :-1.5772 Min. :-2.19266 Min. :-1.0243 Min. :-1.3425 Min. :-1.66345
1st Qu.:-0.6289 1st Qu.:-0.6661 1st Qu.:-0.47230 1st Qu.:-0.7704 1st Qu.:-0.5148 1st Qu.:-0.76202
Median :-0.1661 Median :-0.3017 Median :-0.05941 Median :-0.5164 Median :-0.2579 Median :-0.08594
Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
3rd Qu.: 0.3739 3rd Qu.: 0.3665 3rd Qu.: 0.49111 3rd Qu.: 0.5629 3rd Qu.: 0.2559 3rd Qu.: 0.59014
Max. : 6.6989 Max. : 7.5338 Max. : 9.23057 Max. : 8.5416 Max. :15.8410 Max. :14.56245
totalsulfurdioxide density pH sulphates alcohol quality
Min. :-1.9416 Min. :-2.56383 Min. :-3.10038 Min. :-2.0918 Min. :-7.8396 Min. :3.000
1st Qu.:-0.6855 1st Qu.:-0.79551 1st Qu.:-0.67481 1st Qu.:-0.6805 1st Qu.:-0.8113 1st Qu.:5.000
Median : 0.0399 Median : 0.06668 Median :-0.05287 Median :-0.1429 Median :-0.1532 Median :6.000
Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean :5.818
3rd Qu.: 0.7122 3rd Qu.: 0.77672 3rd Qu.: 0.63126 3rd Qu.: 0.4619 3rd Qu.: 0.6695 3rd Qu.:6.000
Max. : 5.7368 Max. : 6.52124 Max. : 4.92265 Max. : 9.8701 Max. : 3.6311 Max. :9.000
type good
RED :1599 0:2384
WHITE:4898 1:4113
Para a remoção dos outliers, foi utilizado o método Z-Score. Os valores com mais de 3 desvios padrões foram removidos.
wines_padr <- wines_padr[!abs(wines_padr$chlorides) > 3,] %>%
.[!abs(.$sulphates) > 3,] %>%
.[!abs(.$fixedacidity) > 3,] %>%
.[!abs(.$residualsugar) > 3,] %>%
.[!abs(.$freesulfurdioxide) > 3,] %>%
.[!abs(.$alcohol) > 3,] %>%
.[!abs(.$volatileacidity) > 3,] %>%
.[!abs(.$totalsulfurdioxide) > 3,] %>%
.[!abs(.$citricacid) > 3,] %>%
.[!abs(.$density) > 3,] %>%
.[!abs(.$pH) > 3,]
str(wines_padr)
'data.frame': 6007 obs. of 13 variables:
$ fixedacidity : num -0.475 -0.397 2.611 -1.4 -0.397 ...
$ volatileacidity : num -0.60537 0.00203 -0.18019 -0.96981 -0.24093 ...
$ citricacid : num 0.216 0.766 1.179 -0.541 0.835 ...
$ residualsugar : num 0.478 -0.813 -0.686 -0.135 2.817 ...
$ chlorides : num -0.7146 -0.4291 0.1988 -0.4291 0.0276 ...
$ freesulfurdioxide : num 0.3084 -0.0859 -0.7057 -0.0296 1.9423 ...
$ totalsulfurdioxide: num 0.3407 -0.0309 -1.3401 -0.0486 1.9153 ...
$ density : num -0.3019 -1.5394 0.983 -0.0821 1.6457 ...
$ pH : num -0.1773 0.0715 -0.4882 1.2532 -0.6748 ...
$ sulphates : num -1.08375 -0.61334 -0.14293 -0.88214 -0.00852 ...
$ alcohol : num 0.0114 1.739 -0.5645 -0.8936 -1.1404 ...
$ quality : int 5 6 6 6 5 5 4 6 7 5 ...
$ type : Factor w/ 2 levels "RED","WHITE": 2 2 1 2 2 1 1 1 2 2 ...
summary(wines_padr)
fixedacidity volatileacidity citricacid residualsugar chlorides freesulfurdioxide totalsulfurdioxide
Min. :-2.55725 Min. :-1.57721 Min. :-2.19266 Min. :-1.02435 Min. :-1.34254 Min. :-1.66345 Min. :-1.94163
1st Qu.:-0.62888 1st Qu.:-0.72685 1st Qu.:-0.47230 1st Qu.:-0.77039 1st Qu.:-0.54330 1st Qu.:-0.70568 1st Qu.:-0.56163
Median :-0.24321 Median :-0.30167 Median :-0.05941 Median :-0.47410 Median :-0.28641 Median :-0.08594 Median : 0.07529
Mean :-0.09333 Mean :-0.08095 Mean :-0.03948 Mean : 0.01329 Mean :-0.12702 Mean : 0.01057 Mean : 0.04015
3rd Qu.: 0.29673 3rd Qu.: 0.30573 3rd Qu.: 0.42229 3rd Qu.: 0.58408 3rd Qu.: 0.08467 3rd Qu.: 0.64648 3rd Qu.: 0.71221
Max. : 2.99645 Max. : 2.97828 Max. : 2.89962 Max. : 2.97556 Max. : 2.99616 Max. : 2.95642 Max. : 2.94144
density pH sulphates alcohol quality type
Min. :-2.563832 Min. :-2.9759884 Min. :-2.09177 Min. :-2.0453 Min. :3.00 RED :1277
1st Qu.:-0.856366 1st Qu.:-0.6748102 1st Qu.:-0.68054 1st Qu.:-0.8113 1st Qu.:5.00 WHITE:4730
Median : 0.002439 Median :-0.0528702 Median :-0.21013 Median :-0.1532 Median :6.00
Mean :-0.065188 Mean :-0.0002015 Mean :-0.08039 Mean : 0.0185 Mean :5.84
3rd Qu.: 0.712475 3rd Qu.: 0.6312639 3rd Qu.: 0.39469 3rd Qu.: 0.6695 3rd Qu.:6.00
Max. : 2.673526 Max. : 2.9946361 Max. : 2.94835 Max. : 2.9319 Max. :9.00
wines_padr %>%
select(-type) %>%
ggcorr(method = c("pairwise","spearman"), label = FALSE, angle = -0, hjust = 0.2) +
coord_flip()
Coordinate system already present. Adding new coordinate system, which will replace the existing one.
wines_adjusted %>%
select(-citricacid, -totalsulfurdioxide) %>%
ggpairs()
p1 <- wines_padr %>%
ggplot(aes(x = density, y = alcohol, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
p2 <- wines_padr %>%
ggplot(aes(x = density, y = fixedacidity, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
p3 <- wines_padr %>%
ggplot(aes(x = volatileacidity, y = quality, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
p4 <- wines_padr %>%
ggplot(aes(x = alcohol, y = quality, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
p5 <- wines_padr %>%
ggplot(aes(x = residualsugar, y = alcohol, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
p6 <- wines_padr %>%
ggplot(aes(x = density, y = chlorides, color = type)) +
geom_point(alpha = 0.2, size = 2) +
geom_smooth(method = 'lm')
multiplot(p1, p2, p3, p4, p5, p6, cols = 2 )
Utilizarei 80% dos dados para treino, e 20% para teste.
set.seed(32312)
train_ind <- floor(0.8 * nrow(wines_padr)) %>%
sample(seq_len(nrow(wines_padr)), size = .)
train <- wines_padr[train_ind, ]
test <- wines_padr[-train_ind, ]
Testando a regressão linear com todas as variáveis.
lm_01 <- lm(quality ~ fixedacidity + volatileacidity
+ citricacid + residualsugar + chlorides
+ freesulfurdioxide + totalsulfurdioxide
+ density + pH + sulphates + alcohol + type, data=train)
summary(lm_01)
Call:
lm(formula = quality ~ fixedacidity + volatileacidity + citricacid +
residualsugar + chlorides + freesulfurdioxide + totalsulfurdioxide +
density + pH + sulphates + alcohol + type, data = train)
Residuals:
Min 1Q Median 3Q Max
-3.3562 -0.4791 -0.0462 0.4492 3.0019
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.14330 0.05492 111.856 < 2e-16 ***
fixedacidity 0.16998 0.02777 6.121 1.00e-09 ***
volatileacidity -0.22871 0.01724 -13.267 < 2e-16 ***
citricacid -0.01724 0.01408 -1.224 0.22084
residualsugar 0.35439 0.03726 9.510 < 2e-16 ***
chlorides -0.02325 0.02737 -0.849 0.39566
freesulfurdioxide 0.12012 0.01689 7.112 1.31e-12 ***
totalsulfurdioxide -0.06493 0.02159 -3.007 0.00265 **
density -0.42574 0.06051 -7.036 2.25e-12 ***
pH 0.11342 0.01820 6.232 5.01e-10 ***
sulphates 0.12198 0.01495 8.159 4.26e-16 ***
alcohol 0.22863 0.03046 7.506 7.23e-14 ***
typeWHITE -0.42814 0.07238 -5.915 3.55e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7247 on 4792 degrees of freedom
Multiple R-squared: 0.2961, Adjusted R-squared: 0.2943
F-statistic: 168 on 12 and 4792 DF, p-value: < 2.2e-16
Pelos resultados acima, é possível concluir que o modelo linear é estatisticamente significante, devido o p-value ser inferior a 0,05. Entretanto há variáveis que não são significantes, e podem ser removidas.
Como vimos na seção anterior, vamos remover citricacid, chlorides e totalsulfurdioxide e criar um novo modelo. Como vimos durante a análise exploratória, há uma correlação muito forte entre totalsulfurdioxide e freesulfurdioxide. Por esse motivo, utilizaremos somente uma delas.
lm_02 <- lm(quality ~ fixedacidity + volatileacidity
+ residualsugar + freesulfurdioxide + density
+ pH + sulphates + alcohol + type, data=train)
summary(lm_02)
Call:
lm(formula = quality ~ fixedacidity + volatileacidity + residualsugar +
freesulfurdioxide + density + pH + sulphates + alcohol +
type, data = train)
Residuals:
Min 1Q Median 3Q Max
-3.3258 -0.4823 -0.0465 0.4566 3.0180
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.23068 0.04580 136.041 < 2e-16 ***
fixedacidity 0.17508 0.02706 6.470 1.08e-10 ***
volatileacidity -0.23022 0.01597 -14.412 < 2e-16 ***
residualsugar 0.37796 0.03602 10.494 < 2e-16 ***
freesulfurdioxide 0.08877 0.01368 6.489 9.53e-11 ***
density -0.47691 0.05796 -8.228 2.43e-16 ***
pH 0.12157 0.01787 6.802 1.16e-11 ***
sulphates 0.11968 0.01492 8.024 1.28e-15 ***
alcohol 0.21846 0.03024 7.224 5.85e-13 ***
typeWHITE -0.54185 0.05929 -9.138 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7254 on 4795 degrees of freedom
Multiple R-squared: 0.2943, Adjusted R-squared: 0.2929
F-statistic: 222.1 on 9 and 4795 DF, p-value: < 2.2e-16
Apesar de todas as variáveis independentes possuírem um baixo p-value, o nosso R-Squared se encontra um pouco baixo. Não descartarei necessariamente o modelo devido a um valor pequeno do R quadrado. Nesse caso é melhor verificar a acurácia da predição nos dados de teste.
qualityPredict <- predict(lm_02, test, interval = "prediction", level = 0.95)
mse <- mean((test$quality - qualityPredict[,1])^2)
sqrt(mse)
[1] 0.7274402
erro_usando_media <- mean((test$quality - mean(test$quality))^2)
sqrt(erro_usando_media)
[1] 0.8691678
actuals_preds <- data.frame(cbind(actuals=test$quality, predicteds=qualityPredict[,1]))
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
actuals predicteds
actuals 1.000000 0.547456
predicteds 0.547456 1.000000
min_max_accuracy <- mean(apply(actuals_preds, 1, min) / apply(actuals_preds, 1, max))
min_max_accuracy
[1] 0.9109673
mape <- mean(abs((actuals_preds$predicteds - actuals_preds$actuals))/actuals_preds$actuals)
mape
[1] 0.7221432
É possível notar que os resíduos estão dispersos aleatoriamente em torno de zero, com variância constante.
lm_02 %>%
resid() %>%
plot(predict(lm_02), ., xlab = "Preditor linear", ylab = "Residuos")
abline(h = 0, lty = 2)
lm_02 %>%
resid() %>%
hist(main='Histograma dos resíduos')
qqnorm(residuals(lm_02), ylab="Resíduos",xlab="Quantis teóricos",main="")
qqline(residuals(lm_02))
shapiro.test(sample(residuals(lm_02), size = 4800))
Shapiro-Wilk normality test
data: sample(residuals(lm_02), size = 4800)
W = 0.99177, p-value = 4.439e-16
suppressWarnings(CVlm(data=wines_padr,
form.lm = quality ~ fixedacidity + volatileacidity
+ residualsugar + freesulfurdioxide + density
+ pH + sulphates + alcohol + type, m=5, dots=TRUE, seed=29, legend.pos="topleft", printit=FALSE));
Para comparar com a regressão linear, foi feito o modelo de árvore de regressão.
regression_tree <- rpart (quality ~ fixedacidity + volatileacidity
+ citricacid + residualsugar + chlorides
+ freesulfurdioxide + totalsulfurdioxide
+ density + pH + sulphates + alcohol + type, data=train,
cp = 0.001, minsplit = 15, maxdepth=15)
rpart.plot(regression_tree, type=4, extra=1, under=FALSE, clip.right.labs=TRUE,
fallen.leaves=FALSE, digits=2, varlen=-10, faclen=20,
cex=0.4, tweak=1.7,
compress=TRUE,box.palette="Grays",
snip=FALSE)
val_pred_regression_tree <- predict(regression_tree, test, interval = "prediction", level = 0.95)
mse_tree <- mean((test$quality - val_pred_regression_tree)^2)
sqrt(mse_tree)
[1] 0.7261796
actuals_preds <- data.frame(cbind(actuals=test$quality, predicteds=val_pred_regression_tree))
correlation_accuracy <- cor(actuals_preds)
correlation_accuracy
actuals predicteds
actuals 1.0000000 0.5740011
predicteds 0.5740011 1.0000000
rs <- val_pred_regression_tree - test$quality
plot(predict(regression_tree, test), rs, xlab = "Com Árvore de Regressão", ylab = "Residuos")
abline(h = 0, lty = 2)
hist(rs)
qqnorm(rs, ylab="Resíduos",xlab="Quantis teóricos",main="")
qqline(rs)
Embora com resultados muito parecidos, a técnica de árvore de regressão teve uma acurácia um pouco maior. É possível que o decepcionante resultado das duas técnicas se deve ao tratamento dos vinhos tintos juntamente com os vinhos brancos. Como deu para perceber pela análise exploratória, os dois tipos de vinhos possuem algumas características físico-químicas com comportamentos diferentes.
wines_padr$good <- as.factor(ifelse(wines_padr$quality >= 6,1,0))
summary(wines_padr$good)
0 1
2384 4113
train <- wines_padr[train_ind, ]
test <- wines_padr[-train_ind, ]
decision_tree <- rpart (train$good ~ fixedacidity + volatileacidity
+ citricacid + residualsugar + chlorides
+ freesulfurdioxide + totalsulfurdioxide
+ density + pH + sulphates + alcohol + type,
data = train)
fancyRpartPlot(decision_tree)
decision_tree_predict <- predict(decision_tree, test, type='class')
confusion_matrix <- table(test$good, decision_tree_predict)
confusion_matrix
decision_tree_predict
0 1
0 335 289
1 165 903
diagonal <- diag(confusion_matrix)
Acc <- sum(diagonal)/sum(confusion_matrix)
Acc
[1] 0.7512479
log_01 <- glm(train$good ~ fixedacidity + volatileacidity
+ citricacid + residualsugar + chlorides
+ freesulfurdioxide + totalsulfurdioxide
+ density + pH + sulphates + alcohol + type, train, family=binomial(link=logit))
summary(log_01)
Call:
glm(formula = train$good ~ fixedacidity + volatileacidity + citricacid +
residualsugar + chlorides + freesulfurdioxide + totalsulfurdioxide +
density + pH + sulphates + alcohol + type, family = binomial(link = logit),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7851 -0.9028 0.4281 0.8156 2.2976
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.45734 0.19125 7.620 2.54e-14 ***
fixedacidity 0.26705 0.09188 2.906 0.003656 **
volatileacidity -0.80868 0.06112 -13.232 < 2e-16 ***
citricacid -0.13495 0.04561 -2.959 0.003085 **
residualsugar 0.72859 0.12319 5.914 3.34e-09 ***
chlorides -0.03862 0.09021 -0.428 0.668548
freesulfurdioxide 0.33884 0.05720 5.924 3.14e-09 ***
totalsulfurdioxide -0.29247 0.07133 -4.100 4.13e-05 ***
density -0.70905 0.20046 -3.537 0.000404 ***
pH 0.16099 0.06049 2.661 0.007780 **
sulphates 0.40595 0.05437 7.467 8.21e-14 ***
alcohol 0.88727 0.10395 8.536 < 2e-16 ***
typeWHITE -0.93732 0.24913 -3.762 0.000168 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6266.8 on 4804 degrees of freedom
Residual deviance: 4912.8 on 4792 degrees of freedom
AIC: 4938.8
Number of Fisher Scoring iterations: 5
O segundo modelo foi feito removendo as variáveis que possuíam um p-value muito elevado no primeiro modelo.
log_02 <- glm(train$good ~ fixedacidity + volatileacidity
+ citricacid + residualsugar
+ freesulfurdioxide + totalsulfurdioxide
+ density + pH + sulphates + alcohol + type, train, family=binomial(link=logit))
summary(log_02)
Call:
glm(formula = train$good ~ fixedacidity + volatileacidity + citricacid +
residualsugar + freesulfurdioxide + totalsulfurdioxide +
density + pH + sulphates + alcohol + type, family = binomial(link = logit),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7821 -0.9000 0.4288 0.8189 2.3022
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.44283 0.18810 7.671 1.71e-14 ***
fixedacidity 0.27297 0.09083 3.005 0.002654 **
volatileacidity -0.81038 0.06099 -13.288 < 2e-16 ***
citricacid -0.13585 0.04555 -2.983 0.002858 **
residualsugar 0.74017 0.12018 6.159 7.34e-10 ***
freesulfurdioxide 0.33905 0.05719 5.928 3.06e-09 ***
totalsulfurdioxide -0.29319 0.07131 -4.112 3.93e-05 ***
density -0.72686 0.19607 -3.707 0.000210 ***
pH 0.16513 0.05971 2.766 0.005678 **
sulphates 0.40672 0.05434 7.485 7.14e-14 ***
alcohol 0.88616 0.10391 8.528 < 2e-16 ***
typeWHITE -0.91343 0.24263 -3.765 0.000167 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 6266.8 on 4804 degrees of freedom
Residual deviance: 4913.0 on 4793 degrees of freedom
AIC: 4937
Number of Fisher Scoring iterations: 5
logistic_regression_predict <- predict(log_02, test, type="response") %>%
cut(., breaks=c(0,0.50,1), right=F)
MC <- table(test$good, logistic_regression_predict , deparse.level = 2) # montar a matriz de confusão
show(MC)
logistic_regression_predict
test$good [0,0.5) [0.5,1)
0 257 181
1 126 638
ACC = sum(diag(MC))/sum(MC)
show(ACC)
[1] 0.7445923
p <- predict(log_02, test, type="response")
pr <- prediction(p, test$good)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.8135444
As técnicas de árvore de decisão e regressão logística apresentaram resultados muito similares. Em contraste com a primeira etapa, os modelos para previsão de uma variável categórica de qualidade tiveram uma acurácia muito maior.
wines_padr <- wines_padr %>%
select(-quality, -good, -type)
wss <- (nrow(wines_padr)-1)*sum(apply(wines_padr,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(wines_padr,
centers=i)$withinss)
did not converge in 10 iterations
plot(1:20, wss, type="b", xlab="Número de clusters",
ylab="Soma de quadrados intra-clusters")
Analisando o gráfico acima, optei por dividir os vinhos em 6 grupos distintos.
Identificando os grupos no Dendograma.
hier_cluster <- hclust(dist(wines_padr),method='ward.D2')
d <- dist(wines_padr, method = "euclidean")
plot(hier_cluster, ylab='distancia', cex=0.6)
groups <- cutree(hier_cluster, k=6) #Identificando os 6 diferentes grupos
rect.hclust(hier_cluster, k=6, border="red")
set.seed(1232)
output_cluster <- kmeans(wines_padr, 6, iter=100)
output_cluster
K-means clustering with 6 clusters of sizes 946, 1206, 1483, 1219, 646, 997
Cluster means:
fixedacidity volatileacidity citricacid residualsugar chlorides freesulfurdioxide totalsulfurdioxide density pH
1 0.08473360 1.6866294 -1.26213820 -0.6317944 0.6817667 -0.7986723 -1.16915557 0.5037673 0.96129972
2 -0.52520677 -0.2668938 -0.02511597 -0.4822657 -0.5802020 -0.1213941 -0.16797325 -1.3589390 -0.04482517
3 -0.17028943 -0.3492428 0.31064945 1.4659004 -0.1443988 0.9372052 0.99798842 0.9223711 -0.49867076
4 0.09639664 -0.4497086 0.23995436 -0.2777307 -0.2107031 -0.2721956 0.07332153 -0.4284112 -0.72419807
5 2.00659014 0.5019104 0.95725823 -0.5616504 1.2624821 -0.8942313 -1.25067366 1.0021260 -0.06731150
6 -0.60981357 -0.5333900 -0.14775699 -0.2941430 -0.2906727 0.4228183 0.54878266 -0.3316950 0.81291779
sulphates alcohol
1 0.40065402 -0.24807743
2 -0.30296210 1.39885650
3 -0.27728431 -0.84938155
4 -0.50528600 -0.08757578
5 1.42050025 0.02119780
6 0.09615516 -0.09994560
Clustering vector:
[1] 4 2 5 6 3 1 1 1 2 3 3 6 6 6 4 2 1 4 1 6 6 3 1 4 4 4 6 5 1 3 2 1 5 4 1 6 6 3 2 3 3 3 6 5 2 3 4 3 2 2 4 2 6 6 1 6 4 6 1 1 2 3
[63] 4 4 1 2 4 5 2 3 6 4 3 1 4 5 5 6 2 3 2 2 5 5 5 1 2 3 1 2 5 6 2 2 3 2 3 4 2 2 1 4 2 4 6 1 4 2 4 6 3 4 4 4 4 3 2 4 6 2 4 6 5 3
[125] 6 3 4 6 1 1 6 5 6 2 3 1 1 3 4 3 3 3 2 3 2 2 5 3 1 1 6 4 4 2 1 6 3 3 5 3 3 6 3 5 2 2 1 4 6 4 1 1 3 3 2 6 2 1 6 3 2 1 4 1 4 6
[187] 3 6 1 3 4 1 3 2 5 2 6 3 2 3 4 2 3 6 3 6 3 6 5 3 1 2 6 5 5 6 3 2 4 3 3 3 4 4 3 1 1 4 2 3 1 2 4 3 1 4 3 1 5 3 5 1 1 3 2 6 6 6
[249] 3 2 4 6 2 2 4 3 5 3 3 3 3 1 1 2 1 2 2 6 4 3 2 4 3 3 1 5 2 3 5 1 2 3 5 1 6 5 2 5 3 2 3 6 2 2 1 6 1 3 5 6 3 3 1 5 4 3 2 1 1 3
[311] 4 1 5 1 2 3 1 4 1 1 4 6 6 3 6 2 4 1 1 3 5 4 3 3 5 1 4 3 5 3 2 5 4 3 6 6 1 5 4 2 2 4 4 6 4 4 3 2 6 6 4 2 3 2 6 1 2 3 2 4 3 4
[373] 3 1 4 1 2 6 4 6 5 5 3 6 3 1 2 4 2 2 1 2 6 5 3 6 5 1 4 4 1 4 3 4 2 3 6 2 5 1 4 4 2 6 3 6 4 3 5 1 2 3 3 3 3 3 2 6 2 2 6 1 2 2
[435] 4 1 3 4 2 2 3 4 3 3 3 3 1 6 1 1 6 4 4 4 6 3 3 2 4 4 2 3 1 6 6 4 2 6 4 2 2 2 2 4 3 4 6 3 6 2 4 5 2 2 6 6 3 2 4 3 4 4 2 4 2 1
[497] 3 4 5 3 2 4 6 2 4 6 5 6 4 6 5 4 1 6 3 3 5 6 2 3 1 1 3 6 6 4 6 3 6 4 6 2 2 1 6 2 1 3 1 6 3 4 1 4 2 5 6 6 1 4 3 3 3 6 2 6 5 2
[559] 5 6 4 2 3 3 6 2 3 6 2 5 4 4 6 2 4 4 3 3 3 2 6 5 2 4 3 3 5 2 1 3 4 1 1 3 1 2 3 3 2 3 4 3 3 3 5 1 1 1 1 4 3 4 4 4 5 3 2 1 6 5
[621] 2 2 4 4 2 1 1 1 2 2 3 5 1 6 3 3 4 4 3 5 2 3 4 3 2 2 6 6 3 3 3 2 4 5 2 2 1 3 2 5 5 2 2 5 5 3 6 4 2 1 6 3 4 2 4 2 3 1 4 3 1 4
[683] 5 3 3 2 1 4 3 4 1 1 2 2 1 2 3 3 3 3 2 5 4 2 1 2 4 3 2 1 6 1 2 4 3 4 3 4 2 6 3 3 6 3 3 6 6 4 6 4 4 1 2 6 3 3 6 2 5 5 3 4 2 4
[745] 1 2 2 1 2 5 5 1 6 5 6 6 2 5 1 1 6 1 4 2 1 2 5 3 6 1 3 1 3 6 2 5 2 4 4 2 3 4 4 2 1 6 2 6 6 1 1 5 6 4 5 4 3 1 4 2 2 4 6 4 1 6
[807] 3 1 1 4 4 3 3 5 3 6 5 4 6 6 1 1 5 1 6 4 3 1 2 4 4 6 3 3 4 4 4 3 4 4 4 1 3 6 3 3 3 6 2 5 5 6 4 3 4 2 4 2 3 1 2 2 2 5 6 4 1 6
[869] 6 3 4 3 4 6 5 4 6 2 6 5 1 1 2 1 2 4 5 6 3 4 2 1 2 2 5 6 6 1 2 4 1 2 3 2 3 3 3 1 3 4 1 2 4 3 6 5 3 1 5 2 2 4 1 4 1 4 1 2 4 6
[931] 6 1 3 2 2 6 5 3 6 4 5 1 4 5 1 6 1 1 6 6 2 2 2 6 4 2 1 2 3 2 2 2 3 3 5 6 3 6 5 5 3 5 2 1 1 2 6 4 2 2 2 4 5 3 3 6 6 2 6 1 6 1
[993] 4 3 2 6 5 6 3 4
[ reached getOption("max.print") -- omitted 5497 entries ]
Within cluster sum of squares by cluster:
[1] 5121.289 4905.915 7942.310 5922.045 7546.492 4797.123
(between_SS / total_SS = 49.3 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
table(output_cluster$cluster)
1 2 3 4 5 6
946 1206 1483 1219 646 997
Visualizando o resultado de uma forma gráfica
plot(tkmeans(wines_padr , k = 6, alpha = 0.01))
Vamos tentar reduzir o número de variáveis e verificar o impacto que isso terá na clusterização.
acpcor_wines <- prcomp(wines_padr, scale = TRUE, retx = TRUE)
summary(acpcor_wines)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Standard deviation 1.7412 1.5791 1.2464 0.98545 0.84537 0.77894 0.72356 0.70943 0.5821 0.47810 0.18564
Proportion of Variance 0.2756 0.2267 0.1412 0.08828 0.06497 0.05516 0.04759 0.04575 0.0308 0.02078 0.00313
Cumulative Proportion 0.2756 0.5023 0.6435 0.73181 0.79678 0.85194 0.89953 0.94528 0.9761 0.99687 1.00000
plot(1:ncol(wines_padr), acpcor_wines$sdev^2, type = "b", xlab = "Componente",
ylab = "Variância", pch = 20, cex.axis = 0.8, cex.lab = 0.8)
Pelas informações acima, é possível verificar que ao optar pelos 6 primeiros componentes, obtém-se 85% da variância dos dados.
escore1 <- acpcor_wines$x[, 1]
escore2 <- acpcor_wines$x[, 2]
escore3 <- acpcor_wines$x[, 3]
escore4 <- acpcor_wines$x[, 4]
escore5 <- acpcor_wines$x[, 5]
escore6 <- acpcor_wines$x[, 6]
wines_cpa <-cbind(escore1, escore2, escore3, escore4, escore5, escore6)
wss <- (nrow(wines_cpa)-1)*sum(apply(wines_cpa,2,var))
for (i in 2:20) wss[i] <- sum(kmeans(wines_cpa,
centers=i)$withinss)
plot(1:20, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
set.seed(5425)
output_cluster_pca <- kmeans(wines_cpa, 6,iter=100)
output_cluster
K-means clustering with 6 clusters of sizes 1232, 930, 1422, 1128, 1130, 655
Cluster means:
escore1 escore2 escore3 escore4 escore5 escore6
1 -0.5561151 0.2304678 -0.5060747 0.40461549 -0.07829805 -0.54193526
2 2.5395077 -0.1482698 -1.4627894 -0.38520916 -0.12219321 0.13766453
3 -1.9186941 -1.5728764 -0.4141836 -0.09042038 0.19038205 0.18280241
4 -0.3889762 0.7653344 1.1073630 -0.66952847 -0.24478532 -0.02662016
5 -0.1498876 2.1033691 0.2640424 0.39209807 0.03840748 0.24111008
6 2.5342180 -1.7549915 1.5654563 0.45876841 0.26274350 0.05689213
Clustering vector:
[1] 1 5 6 1 3 2 2 2 5 3 3 1 1 1 4 5 2 4 2 1 1 3 2 1 4 4 1 6 2 3 5 2 6 1 2 1 1 3 4 3 3 3 1 6 5 3 4 3 4 4 4 5 1 1 2 5 4 1 2 2 5 3
[63] 4 4 2 4 4 6 5 3 1 4 3 2 4 6 6 5 5 3 5 5 6 6 6 2 5 3 2 5 6 1 4 5 3 5 3 4 5 5 2 4 5 4 1 2 4 4 4 5 3 4 4 1 4 3 5 1 1 5 4 1 6 3
[125] 1 3 1 1 2 2 1 6 1 5 3 2 2 3 4 3 3 3 5 3 4 5 6 3 2 2 1 4 4 5 2 1 3 3 6 3 3 1 3 6 5 4 2 4 1 4 2 2 3 3 5 1 4 2 1 3 4 2 1 2 4 1
[187] 3 1 2 3 1 2 3 4 6 5 5 3 5 3 1 5 3 1 3 1 3 5 6 3 2 5 5 6 6 1 3 5 4 3 3 3 4 1 3 2 2 4 5 4 2 5 4 3 2 1 3 2 6 3 6 2 2 3 5 1 1 1
[249] 3 5 4 1 5 5 4 3 6 3 4 3 3 2 2 5 2 5 5 5 4 3 5 4 3 3 2 6 4 3 6 2 5 3 6 2 1 6 5 6 3 4 3 1 5 5 2 1 2 3 6 1 3 3 2 6 1 3 5 2 2 3
[311] 4 2 6 2 5 3 2 1 2 2 1 1 5 3 1 5 4 2 2 3 6 4 3 3 6 2 4 3 6 3 5 6 4 3 5 1 2 6 4 5 4 4 4 5 1 4 3 5 5 1 4 5 3 5 5 2 5 3 5 4 3 4
[373] 3 2 1 2 5 5 4 1 6 6 3 1 3 2 5 4 5 5 2 4 1 6 3 1 6 2 4 1 2 4 3 4 5 3 1 5 6 2 4 4 5 1 3 5 4 3 6 2 5 3 3 3 1 3 5 1 5 5 1 2 5 5
[435] 3 2 3 4 5 5 3 4 3 3 3 3 2 1 2 2 1 1 1 4 1 3 3 5 4 4 5 1 2 1 5 4 5 1 4 5 4 5 5 1 3 4 1 3 1 5 4 6 5 5 1 1 1 5 4 3 1 1 5 1 4 2
[497] 3 4 6 3 5 4 1 5 4 1 6 1 1 1 6 1 2 1 3 3 2 5 5 3 2 2 3 1 1 4 1 1 5 1 1 4 5 2 1 4 2 3 2 1 3 4 2 1 5 6 1 1 2 4 1 3 3 1 5 1 6 4
[559] 6 1 4 5 3 3 1 5 3 1 4 6 4 4 1 4 4 1 3 3 3 5 1 6 4 4 3 1 6 5 2 3 4 2 6 3 2 5 3 3 5 3 1 3 3 3 6 2 2 2 2 4 3 4 4 1 6 3 5 2 1 6
[621] 4 5 3 4 5 2 5 2 5 5 3 6 2 1 3 3 4 1 3 6 5 3 4 3 5 5 1 1 3 3 3 5 4 6 4 5 2 1 4 6 6 4 5 6 6 1 1 4 4 2 1 3 4 4 4 5 3 2 4 4 2 1
[683] 6 3 3 4 2 4 3 4 2 2 5 5 2 4 3 3 3 3 4 6 1 4 2 4 4 3 5 2 1 2 5 4 3 4 3 4 5 1 3 3 1 3 3 1 5 4 1 4 4 2 5 1 3 3 1 5 6 6 3 4 5 1
[745] 2 5 4 2 5 6 6 2 1 6 5 1 4 6 2 2 1 2 1 5 2 5 6 1 1 2 3 2 3 1 5 6 5 4 3 5 3 4 1 4 2 1 5 1 1 2 2 6 1 4 6 4 1 2 1 5 5 4 1 4 2 5
[807] 3 1 2 4 4 3 3 6 3 1 6 4 1 1 2 2 6 2 1 1 3 2 5 4 5 5 3 3 1 4 4 3 4 1 4 2 3 1 3 3 3 1 4 6 6 5 1 3 4 4 4 5 3 2 4 4 5 6 1 4 2 1
[869] 1 3 4 3 1 1 6 1 1 5 2 6 2 2 5 2 4 4 6 1 3 4 4 4 5 5 6 1 1 2 5 4 2 4 3 5 3 3 3 2 3 1 2 5 4 1 1 6 3 2 6 4 5 4 2 4 2 1 2 5 4 1
[931] 1 2 3 4 4 1 6 3 5 4 6 2 4 6 2 1 2 2 1 1 5 5 5 1 4 5 2 5 3 5 4 5 3 3 6 1 3 1 6 6 3 6 5 2 2 5 1 1 5 5 5 4 6 3 3 5 1 5 1 2 1 2
[993] 1 3 4 1 6 1 3 4
[ reached getOption("max.print") -- omitted 5497 entries ]
Within cluster sum of squares by cluster:
[1] 4526.465 3837.918 4984.047 3541.824 3502.742 5717.291
(between_SS / total_SS = 57.1 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
table(output_cluster_pca$cluster)
1 2 3 4 5 6
1232 930 1422 1128 1130 655
plot(tkmeans(wines_cpa , k = 6, alpha = 0.01))
Agora vou cruzar os resultados obtidos a partir das variáveis com os resultados obitos pelos componentes principais.
cruzamento <- data.frame(cbind(output_cluster$cluster, output_cluster_pca$cluster))
colnames(cruzamento) <- c("sem_pca", "com_pca")
cruzamento %>%
mutate(sem_pca = as.factor(sem_pca), com_pca = as.factor(com_pca)) %>%
ggplot(aes(sem_pca, com_pca)) +
geom_count(colour = "blue") +
theme(legend.position = "bottom")
O count plot acima indica que, apesar de haver mudança na forma como os vinhos foram classificados, em sua maioria os agrupamentos continuaram parecidos.